Load packages

library(tidyverse)
library(skimr)
library(caret)
library(recipes)

Read in data from CSV file

# Download the airline satisfaction CSV and parse it with readr
airsat <- "https://osf.io/7sqcn/download" %>%
  read_csv()
# Print the imported data for a quick look
airsat

Create training and testing sets

# Fix the RNG seed so the random partition is reproducible
set.seed(2021)
# Draw 80% of the rows for training, sampling within the outcome classes;
# with list = FALSE caret returns the row numbers as a one-column matrix
index <- createDataPartition(airsat$satisfaction, p = 0.8, list = FALSE)
# Subset with the matrix's only column (a plain integer vector): tibbles
# deprecate matrix row subscripts
airsat_train <- airsat[index[, 1], ]
airsat_test <- airsat[-index[, 1], ]

# Confirm the sizes (rows, columns) of the two sets
dim(airsat_train)
#> [1] 8001   23
dim(airsat_test)
#> [1] 1999   23

Explore the training set

# Summarize every column of the training set
airsat_train %>%
  skim()
Data summary
Name airsat_train
Number of rows 8001
Number of columns 23
_______________________
Column type frequency:
character 5
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
satisfaction 0 1 9 12 0 2 0
sex 0 1 4 6 0 2 0
customer_type 0 1 5 8 0 2 0
travel_type 0 1 8 8 0 2 0
class 0 1 3 8 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 39.52 14.95 7 27 40 51 85 ▃▇▇▅▁
flight_distance 0 1 1987.42 1037.12 50 1364 1918 2559 6595 ▃▇▃▁▁
seat_comfort 0 1 2.88 1.39 0 2 3 4 5 ▇▇▇▇▅
time_convenience 0 1 2.99 1.52 0 2 3 4 5 ▇▆▆▇▇
food_drink 0 1 2.90 1.43 0 2 3 4 5 ▇▇▇▇▆
gate_location 0 1 3.00 1.31 1 2 3 4 5 ▅▆▇▇▅
inflight_wifi 0 1 3.25 1.32 0 2 3 4 5 ▃▇▇▇▇
inflight_entertainment 0 1 3.42 1.33 0 3 4 4 5 ▂▃▅▇▆
online_support 0 1 3.53 1.31 1 3 4 5 5 ▃▃▅▇▇
ease_booking 0 1 3.48 1.31 1 2 4 5 5 ▃▅▅▇▇
onboard_service 0 1 3.48 1.27 1 3 4 4 5 ▂▃▅▇▆
leg_room 0 1 3.48 1.30 0 2 4 5 5 ▂▅▅▇▇
baggage_handling 0 1 3.70 1.16 1 3 4 5 5 ▂▂▃▇▆
checkin_service 0 1 3.34 1.27 1 3 3 4 5 ▃▃▇▇▆
cleanliness 0 1 3.70 1.15 1 3 4 5 5 ▁▂▃▇▆
online_boarding 0 1 3.35 1.31 1 2 4 4 5 ▃▅▇▇▇
departure_delay 0 1 14.32 36.88 0 0 0 12 569 ▇▁▁▁▁
arrival_delay 34 1 14.56 37.15 0 0 0 13 543 ▇▁▁▁▁

Create and prep recipe

# Define the recipe from the training set only, so the held-out test rows are
# never part of the recipe (not even as its template data)
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat_train) %>%
  # Drop predictors with (near-)zero variance
  step_nzv(all_predictors()) %>%
  # Drop numeric predictors that are exact linear combinations of others
  step_lincomb(all_numeric_predictors()) %>%
  # Center and scale the numeric predictors (needed before PCA)
  step_normalize(all_numeric_predictors()) %>%
  # Replace numeric predictors with the components covering 90% of variance
  step_pca(all_numeric_predictors(), threshold = 0.9) %>%
  # Dummy-code the nominal predictors
  step_dummy(all_nominal_predictors()) %>%
  # Estimate every step on the training set and log column changes per step
  prep(training = airsat_train, log_changes = TRUE)
#> step_nzv (nzv_AY9ua): 
#>  removed (2): departure_delay, arrival_delay
#> 
#> step_lincomb (lincomb_AySWE): same number of columns
#> 
#> step_normalize (normalize_V6vGz): same number of columns
#> 
#> step_pca (pca_wPZrK): 
#>  new (11): PC01, PC02, PC03, PC04, PC05, PC06, PC07, PC08, PC09, PC10, ...
#>  removed (16): age, flight_distance, seat_comfort, time_convenience, ...
#> 
#> step_dummy (dummy_5NO9d): 
#>  new (5): sex_Male, customer_type_loyal, travel_type_Personal, class_Eco, ...
#>  removed (4): sex, customer_type, travel_type, class

Bake new training set

# Apply the prepped recipe to the training rows
airsat_baked_train <- airsat_recipe %>%
  bake(new_data = airsat_train)
# Inspect the preprocessed training set
airsat_baked_train

Bake new testing set

# Apply the same prepped recipe (estimated on training data) to the test rows
airsat_baked_test <- airsat_recipe %>%
  bake(new_data = airsat_test)
# Inspect the preprocessed testing set
airsat_baked_test

Hands-on Activity

Modify the code above to accomplish the following goals:

  1. Use 75% of the data for training and 25% of the data for testing.

  2. Apply the Yeo-Johnson transformation to the flight_distance variable (before normalizing it).

  3. Instead of using PCA to address multicollinearity, drop highly correlated predictors.

  4. Use one-hot encoding for the nominal predictors instead of dummy codes.

  5. Add an interaction term between seat_comfort and flight_distance, so that the effect of seat_comfort can depend on flight_distance.

BONUS: Read the “Recommended preprocessing” appendix (https://www.tmwr.org/pre-proc-table.html)

FURTHER READING: https://www.tmwr.org/recipes.html (Chapter) https://bookdown.org/max/FES/ (Book)


Answer key

Click here to view the answer key to the hands-on activity
# Fix the RNG seed so the random partition is reproducible
set.seed(2021)
# With list = FALSE caret returns the row numbers as a one-column matrix
index <- createDataPartition(airsat$satisfaction, p = 0.75, list = FALSE) #1
# Subset with the matrix's only column (a plain integer vector): tibbles
# deprecate matrix row subscripts
airsat_train <- airsat[index[, 1], ]
airsat_test <- airsat[-index[, 1], ]
# Define the recipe from the training set only, so the held-out test rows are
# never part of the recipe (not even as its template data)
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat_train) %>%
  # Drop predictors with (near-)zero variance
  step_nzv(all_predictors()) %>%
  # Drop numeric predictors that are exact linear combinations of others
  step_lincomb(all_numeric_predictors()) %>%
  # Yeo-Johnson transform flight_distance before it is normalized
  step_YeoJohnson(flight_distance) %>%                      #2
  # Center and scale the numeric predictors
  step_normalize(all_numeric_predictors()) %>%
  # Drop highly correlated numeric predictors instead of using PCA
  step_corr(all_numeric_predictors()) %>%                   #3
  # One-hot encode the nominal predictors (one column per level)
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  #4
  # Add the seat_comfort-by-flight_distance interaction column
  step_interact(~ seat_comfort:flight_distance) %>%         #5
  # Estimate every step on the training set and log column changes per step
  prep(training = airsat_train, log_changes = TRUE)
#> step_nzv (nzv_RiYtU): 
#>  removed (2): departure_delay, arrival_delay
#> 
#> step_lincomb (lincomb_fnnfs): same number of columns
#> 
#> step_YeoJohnson (YeoJohnson_s6zuF): same number of columns
#> 
#> step_normalize (normalize_gMFXy): same number of columns
#> 
#> step_corr (corr_7QqN7): same number of columns
#> 
#> step_dummy (dummy_NMHen): 
#>  new (9): sex_Female, sex_Male, customer_type_disloyal, ...
#>  removed (4): sex, customer_type, travel_type, class
#> 
#> step_interact (interact_b1gwO): 
#>  new (1): seat_comfort_x_flight_distance